---
title: "Summary statistics"
author: "Mar and Dave"
date: "7/22/2022"
output:
  html_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(gt)
library(hms)
library(lubridate)

# Display label for every search query, keyed by the name of the column in
# the tweet data that flags whether a tweet matched that query.
queries <- list(
  papuanlivesmatter = "#papuanlivesmatter",
  koman = "Repayment of scholarship by activist Veronica Koman",
  bin_nugraha = "Killing of State Intelligence Agency representative in Papua",
  nkri = "Unitary State of the Republic of Indonesia (NKRI)",
  teroris_kkb = "Description of independence movement as “terrorists” or “armed criminal group”",
  UNassembly = "UN General Assembly",
  ulmwp_wenda = "United Liberation Movement for West Papua or its leader, Benny Wenda",
  FaktadiPapua = "#FaktadiPapua (the Facts in Papua)",
  nduga = "Nduga killings and counter-insurgency response",
  otsus = "Special autonomy (otsus)",
  rasisme = "Racism",
  zanambani = "Murder of Christian Pastor Yeremia Zanambani"
)

# Load the tweet data and flag tweets whose `media_url` is longer than two
# characters as containing an image.
# NOTE(review): the two-character threshold presumably skips empty
# placeholders such as "[]" -- confirm against the raw data.
indo_dt <- readRDS(file = "/home/ubuntu/data/shared_folder/Data/20220729_indo_dt.rds")
indo_dt <- mutate(indo_dt, contains_image = nchar(media_url) > 2)

# Keep only the columns used by the summary statistics: the original-tweet
# flag, one flag column per query, and the author / engagement / image fields.
stats_df <- select(
  indo_dt,
  is_original,
  all_of(names(queries)),
  author_id,
  engagement,
  queries_total,
  contains_image
)

```

# Proportion of tweets picked up by more than one search term:
```{r}
# Share of all tweets matched by more than one search term. Tweets with a
# missing `queries_total` are not counted as multi-term matches.
sum(stats_df$queries_total > 1, na.rm = TRUE) / nrow(stats_df)
```

# Number of tweets picked up by each search term:
```{r}
# Sum each query's flag column to get the number of tweets it matched.
# `all_of()` marks `names(queries)` as an external character vector of column
# names; passing such a vector to a selection bare is deprecated in tidyselect.
stats_df %>%
  summarise(across(all_of(names(queries)), sum))
```

## Function to extract summary statistics for each query
```{r}
# A tweet can match several queries, so each query has its own logical column
# and the statistics below are computed on one subset of tweets at a time.

# Summarise a set of tweets.
#
# `df` must contain the columns `author_id`, `is_original`, `engagement` and
# `contains_image`. Returns a named list with:
#   no_tweets              number of rows in `df`
#   no_distinct_authors    number of distinct `author_id` values
#   mean_tweets_per_author mean number of rows per `author_id`
#   prop_original          mean of `is_original`
#   mean_engagement_tweet  mean of `engagement`
#   prop_no_engagement     share of rows with `is_original` TRUE whose
#                          `engagement` is 0
#   prop_images            mean of `contains_image`
extractStats <- function(df) {
  tweets_per_author <- df %>%
    count(author_id) %>%
    pull(n)

  # Only original tweets enter the zero-engagement share. An earlier variant
  # used every tweet and scaled by 100:
  #   100 * mean(df %>% pull(engagement) == 0)
  original_engagement <- df %>%
    filter(is_original) %>%
    pull(engagement)

  list(
    no_tweets = nrow(df),
    no_distinct_authors = length(unique(df$author_id)),
    mean_tweets_per_author = mean(tweets_per_author),
    prop_original = mean(pull(df, is_original)),
    mean_engagement_tweet = mean(pull(df, engagement)),
    prop_no_engagement = mean(original_engagement == 0),
    prop_images = mean(pull(df, contains_image))
  )
}

# Statistics for which the largest and smallest value get highlighted in the
# summary table.
stats_columns <- c(
  "mean_tweets_per_author",
  "prop_original",
  "mean_engagement_tweet",
  "prop_no_engagement",
  "prop_images"
)
```

# Prepare the table with summary statistics for each query, and overall
```{r}
# Build one row of summary statistics per query (row name = query column
# name), record the per-column extremes across queries, then append an
# "overall" row computed on all tweets.
table <- data.frame(
  no_tweets = numeric(),
  no_distinct_authors = numeric(),
  mean_tweets_per_author = numeric(),
  prop_original = numeric(),
  mean_engagement_tweet = numeric(),
  prop_no_engagement = numeric(),
  prop_images = numeric()
)

# Skip the "overall" label so that re-running this chunk after it has been
# added to `queries` (below) still works.
query_names <- setdiff(names(queries), "overall")

missing_columns <- setdiff(query_names, names(stats_df))
if (length(missing_columns) > 0) {
  stop(
    "stats_df has no column for: ", paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

for (query in query_names) {
  # which() drops NA flags; logical indexing with NA would instead add
  # all-NA rows to the subset.
  matched <- stats_df[which(stats_df[[query]] == TRUE), ]
  table[query, ] <- extractStats(matched)
}

# Extremes are taken over the per-query rows only, before "overall" is added.
max_values <- table %>% summarise(across(everything(), max))
min_values <- table %>% summarise(across(everything(), min))

table["overall", ] <- data.frame(extractStats(stats_df))
queries["overall"] <- "Overall"
```

# Generate the table using the gt package
```{r}
# Apply `text_style` to every body cell of the `stats_columns` columns whose
# value equals the matching entry of `extremes` (a one-row data frame).
style_extremes <- function(gt_tbl, extremes, text_style) {
  for (stat in stats_columns) {
    gt_tbl <- tab_style(
      gt_tbl,
      style = text_style,
      locations = cells_body(
        columns = all_of(stat),
        rows = !!sym(stat) == extremes[[stat]]
      )
    )
  }
  gt_tbl
}

# Use the display label of each row (looked up in `queries` by row name) as
# the table stub.
summary_gt <- table %>%
  mutate(names = queries[row.names(table)]) %>%
  gt(rowname_col = "names")

summary_gt <- cols_label(
  summary_gt,
  no_tweets = "Number of tweets",
  no_distinct_authors = "Number of distinct authors",
  mean_tweets_per_author = "Mean tweets per author",
  prop_original = "Percentage of original tweets",
  mean_engagement_tweet = "Mean engagement per tweet",
  prop_no_engagement = "Percentage of original tweets with zero engagement",
  prop_images = "Percentage of tweets that contain images"
)

# Number formats: proportions as whole percentages, counts with thousands
# separators, means with two significant figures.
summary_gt <- fmt_percent(
  summary_gt,
  columns = starts_with("prop"),
  decimals = 0
)
summary_gt <- fmt_number(
  summary_gt,
  columns = starts_with("no"),
  sep_mark = ",",
  decimals = 0
)
summary_gt <- fmt_number(
  summary_gt,
  columns = starts_with("mean"),
  n_sigfig = 2
)

# Bold the column labels and the "Overall" row label.
summary_gt <- tab_style(
  summary_gt,
  style = cell_text(weight = "bold"),
  locations = list(cells_column_labels(), cells_stub(rows = "Overall"))
)

# Bold the per-query maximum and underline the per-query minimum of each
# highlighted statistic.
summary_gt <- style_extremes(summary_gt, max_values, cell_text(weight = "bold"))
summary_gt <- style_extremes(
  summary_gt,
  min_values,
  cell_text(decorate = "underline")
)

# Left-align everything and remove all table lines; the result of this last
# expression is what the chunk displays.
summary_gt %>%
  tab_style(
    style = cell_text(align = "left", indent = 0),
    locations = list(cells_column_labels(), cells_stub(), cells_body())
  ) %>%
  opt_table_lines(extent = "none")
```

# Plot the distribution of tweet creation over time, with separate bars for original tweets and retweets/quotes/replies
```{r}
# Monthly tweet counts, with side-by-side bars for original tweets and for
# retweets/quotes/replies. `jkt_date_time` is floored to the start of its
# month to form the bins.
tweets_over_time <- indo_dt %>%
  mutate(
    date_bins = floor_date(jkt_date_time, "month"),
    is_original_plot = ifelse(
      is_original,
      "Original tweet",
      "Retweets/Quotes/Replies"
    )
  ) %>%
  ggplot(aes(date_bins, fill = is_original_plot)) +
  geom_bar(position = "dodge") +
  scale_y_continuous(expand = c(0, 0)) +
  scale_x_datetime(
    date_breaks = "month",
    date_labels = "%b %Y",
    expand = c(0, 0)
  ) +
  labs(x = "", y = "Number of tweets", fill = "") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = c(.25, .85)
  )

# Show the plot in the rendered document.
tweets_over_time

# Save the same plot to disk. `filename` is spelled out (the previous
# `file =` only worked through partial argument matching) and the plot is
# passed explicitly instead of relying on last_plot().
# NOTE(review): assumes the `figures/` directory already exists.
ggsave(
  filename = "figures/7_tweet_created_plot.png",
  plot = tweets_over_time,
  width = 2400,
  height = 1200,
  units = "px"
)
```

